import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.image import imread
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from scipy.stats import zscore
from scipy.spatial.distance import cdist
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from sklearn import svm
from sklearn.decomposition import PCA, IncrementalPCA
import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
sns.set(color_codes=True)
# --- Part 1: merge the car-name CSV with the car-attributes JSON ---
df1=pd.read_csv('Part1 - Car name.csv')
print('The size of the data:', df1.size)
print('No of rows in the data:', df1.shape[0])
print('No of columns in the data:', df1.shape[1])
df2=pd.read_json('Part1 - Car-Attributes.json')
print('The size of the data:', df2.size)
print('No of rows in the data:', df2.shape[0])
print('No of columns in the data:', df2.shape[1])
# Left-join on the row index: assumes both files list the same cars in the
# same order — TODO(review): confirm there is no shared key column to join on.
df3=df1.join(df2,how='left')
df3.head(10)
print('The size of the data:', df3.size)
print('No of rows in the data:', df3.shape[0])
print('No of columns in the data:', df3.shape[1])
# Persist the merged frame, then reload it (index_col=0 restores the saved index).
df3.to_csv('Merge_data.csv')
df=pd.read_csv('Merge_data.csv', index_col=0)
df.head(10)
print('The size of the data:', df.size)
print('No of rows in the data:', df.shape[0])
print('No of columns in the data:', df.shape[1])
# --- Prepare a numeric copy of the merged car data ---
df2 = df.copy(deep=True)
df2.drop(axis=1, columns='car_name', inplace=True)
df2.columns
# FIX: DataFrame.convert_objects() was removed from pandas (deprecated 0.21,
# removed 1.0); coerce non-numeric entries (e.g. '?' placeholders) to NaN instead.
df2 = df2.apply(pd.to_numeric, errors='coerce')
df2.isnull().sum()
# Impute the coerced NaNs with each column's median.
medianFiller = lambda col: col.fillna(col.median())
df2 = df2.apply(medianFiller, axis=0)
df2.isnull().sum()
(df2 == 0).all()
# How strongly does each attribute correlate with fuel efficiency (mpg)?
correlation_values = df2.corr()['mpg']
correlation_values.abs().sort_values(ascending=False)
plt.figure(figsize=(15, 7))
plt.title('Correlation of Attributes', y=1.05, size=19)
sns.heatmap(df2.corr(), cmap='plasma', annot=True, fmt='.2f')
# Standardise the features for distance-based clustering later on.
df_scaled = df2.apply(zscore)
df_scaled.nunique()
df2.describe().T
# Five-point summary of each attribute.
five_point = df2.describe().T
five_point[['min', '25%', '50%', '75%', 'max']]
# --- Exploratory plots of the car attributes ---
# FIX: positional seaborn calls were deprecated (removed in seaborn >= 0.14),
# and sns.distplot no longer exists; use keyword args and histplot instead.
sns.pairplot(df2, diag_kind='kde')
sns.countplot(x='origin', data=df2)
sns.countplot(x='cyl', data=df2)
sns.histplot(df2['mpg'], kde=True)
sns.boxplot(x=df2['mpg'])
sns.scatterplot(x='mpg', y='wt', hue='origin', data=df2)
sns.scatterplot(x='mpg', y='wt', hue='cyl', data=df2)
sns.swarmplot(x='origin', y='mpg', hue='cyl', data=df2)
sns.jointplot(x='mpg', y='disp', hue='origin', data=df2)
sns.jointplot(x='mpg', y='hp', hue='cyl', data=df2)
df_scaled.isnull().sum()
# Finding the optimal number of clusters via the elbow method.
# FIX: the loop body had lost its indentation (SyntaxError as plain Python).
clusters = range(1, 10)
meanDistortions = []
for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(df_scaled)
    prediction = model.predict(df_scaled)
    # Mean distance of every point to its nearest centroid ("distortion").
    meanDistortions.append(
        sum(np.min(cdist(df_scaled, model.cluster_centers_, 'euclidean'), axis=1))
        / df_scaled.shape[0]
    )
plt.plot(clusters, meanDistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
# Fit the final 3-cluster KMeans model and time the fit/predict step.
t1 = datetime.datetime.now()
final_model = KMeans(3)
final_model.fit(df_scaled)
prediction = final_model.predict(df_scaled)
t2 = datetime.datetime.now()
print('Time taken for KMeans clustering to fit and predict:', t2 - t1)
# Attach the cluster id to both the raw and the scaled frames.
df3 = df.copy(deep=True)
df_scaled_Kmeans = df_scaled.copy(deep=True)
for frame in (df3, df_scaled_Kmeans):
    frame['GROUP'] = prediction
print("Groups Assigned : \n")
df3.head()
# Per-cluster means give a quick profile of each group.
df3Clust = df3.groupby(['GROUP'])
df3Clust.mean()
# --- Hierarchical (agglomerative) clustering on the scaled car data ---
df4 = df.copy(deep=True)
df_scaled_HClustering = df_scaled.copy(deep=True)
t3 = datetime.datetime.now()
# FIX: 'affinity' was renamed to 'metric' in scikit-learn 1.2 and removed in 1.4.
model = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='average')
model.fit(df_scaled_HClustering)
labels = model.labels_
t4 = datetime.datetime.now()
print('The time taken for Hierarchy Clustering to fit and predict:', t4 - t3)
df4['labels'] = labels
df4.head(10)
df4 = df4.groupby(['labels'])
df4.mean()
# Compare linkage strategies by cophenetic correlation
# (closer to 1 = the dendrogram preserves the pairwise distances better).
# The pdist() call is hoisted out of the loop — it is identical for all methods.
pairwise_dists = pdist(df_scaled_HClustering)
for method in ('average', 'complete', 'ward'):
    Z = linkage(df_scaled_HClustering, metric='euclidean', method=method)
    c, coph_dists = cophenet(Z, pairwise_dists)
    print(c)
# --- One linear-regression model per hierarchical cluster (raw features) ---
# FIX: the inner coefficient loops had lost their indentation (SyntaxError),
# and the three copy-pasted model fits are collapsed into a single loop.
df5 = df2.copy(deep=True)
df5.head()
df5['labels'] = labels
df5.dtypes
dfg1 = df5[df5['labels'] == 0]
dfg2 = df5[df5['labels'] == 1]
dfg3 = df5[df5['labels'] == 2]
dfg1.head()
dfg2.head()
dfg3.head()
regression_model = LinearRegression()
# Message strings preserved verbatim from the original cells.
score_msgs = ['In Sample score for group 1:',
              'In sample score for group 2 :',
              'In sample score for group 3 :']
for dfg, msg in zip((dfg1, dfg2, dfg3), score_msgs):
    # NOTE(review): the 'labels' column is kept among the predictors, as in the
    # original; it is constant within a group so its coefficient is meaningless.
    xg = dfg.drop('mpg', axis=1)   # predictors: everything except the target
    yg = dfg[['mpg']]              # target: fuel efficiency
    xtrain, xtest, ytrain, ytest = train_test_split(xg, yg, test_size=0.30, random_state=1)
    regression_model.fit(xtrain, ytrain)
    print(msg, regression_model.score(xtrain, ytrain))
    for idx, col_name in enumerate(xtrain.columns):
        print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))
# --- Repeat the per-cluster regressions on the z-scored features ---
# FIX: same as the raw-feature section — restored loop indentation (SyntaxError
# in the original) and collapsed the triplicated fits into one loop.
df_scaled_HClustering['labels'] = labels
df_scaled_HClustering.head()
dfg1 = df_scaled_HClustering[df_scaled_HClustering['labels'] == 0]
dfg2 = df_scaled_HClustering[df_scaled_HClustering['labels'] == 1]
dfg3 = df_scaled_HClustering[df_scaled_HClustering['labels'] == 2]
dfg1.head()
dfg2.head()
dfg3.head()
regression_model = LinearRegression()
# Message strings preserved verbatim from the original cells.
score_msgs = ['In Sample score for group 1:',
              'In sample score for group 2 :',
              'In sample score for group 3 :']
for dfg, msg in zip((dfg1, dfg2, dfg3), score_msgs):
    xg = dfg.drop('mpg', axis=1)   # predictors (still includes 'labels', as before)
    yg = dfg[['mpg']]              # target: scaled fuel efficiency
    xtrain, xtest, ytrain, ytest = train_test_split(xg, yg, test_size=0.30, random_state=1)
    regression_model.fit(xtrain, ytrain)
    print(msg, regression_model.score(xtrain, ytrain))
    for idx, col_name in enumerate(xtrain.columns):
        print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))
# --- Part 2: company data — explore and impute missing values ---
df = pd.read_excel('Part2 - Company.xlsx')
df.head()
print('The size of the data:', df.size)
print('No of rows in the data:', df.shape[0])
print('No of columns in the data:', df.shape[1])
df2 = df[['A', 'B', 'C', 'D']]
df2.head()
# FIX: convert_objects() was removed from pandas; coerce non-numeric
# entries to NaN instead.
df2 = df2.apply(pd.to_numeric, errors='coerce')
df2.isnull().sum()
plt.figure(figsize=(15, 7))
plt.title('Correlation of Attributes', y=1.05, size=19)
sns.heatmap(df2.corr(), cmap='plasma', annot=True, fmt='.2f')
df2.nunique()
five_point = df2.describe().T
five_point[['min', '25%', '50%', '75%', 'max']]
sns.pairplot(df2, diag_kind='kde')
# FIX: keyword args replace the deprecated positional seaborn calls.
sns.jointplot(x='A', y='B', data=df)
sns.jointplot(x='C', y='D', data=df)
sns.boxplot(x=df['A'])
sns.boxplot(x=df['C'])
# Impute missing cells with the per-column mode, round-trip through CSV,
# then restore the original column names (fit_transform returns an ndarray).
imp = SimpleImputer(strategy="most_frequent")
df3 = imp.fit_transform(df)
pd.DataFrame(df3).to_csv("Imputeddata.csv", index=None)
df2 = pd.read_csv('Imputeddata.csv')
df2.head()
df2.columns = df.columns
df2.head()
df.isnull().sum()
df2.isnull().sum()
sns.countplot(x='Quality', data=df)
# --- Part 3: vehicle silhouettes — prepare data for SVM classification ---
df = pd.read_csv('Part3 - vehicle.csv')
print('The size of the data:', df.size)
print('No of rows in the data:', df.shape[0])
print('No of columns in the data:', df.shape[1])
df.head()
df.columns
x = df.drop('class', axis=1)   # features
y = df[['class']]              # target label
x.head()
y.head()
# FIX: convert_objects() was removed from pandas; coerce non-numeric entries to NaN.
x = x.apply(pd.to_numeric, errors='coerce')
x.isnull().sum()
# Impute the coerced NaNs with each column's median
# (lambda parameter renamed so it no longer shadows the global frame `x`).
medianFiller = lambda col: col.fillna(col.median())
x = x.apply(medianFiller, axis=0)
x.isnull().sum()
y.isnull().sum()
(x == 0).all()
(y == 0).all()
df = y.join(x, how='left')
df.head()
df.nunique()
plt.figure(figsize=(15, 7))
plt.title('Correlation of Attributes', y=1.05, size=19)
sns.heatmap(x.corr(), cmap='plasma', annot=True, fmt='.2f')
df.describe().T
five_point = x.describe().T
five_point[['min', '25%', '50%', '75%', 'max']]
sns.pairplot(df, diag_kind='kde')
# FIX: keyword args / histplot replace the deprecated seaborn API.
sns.countplot(x='class', data=df)
sns.jointplot(x='compactness', y='scatter_ratio', hue='class', data=df)
sns.histplot(df['compactness'], kde=True)
sns.stripplot(x='pr.axis_rectangularity', y='circularity', hue='class', data=df)
sns.stripplot(x='max.length_aspect_ratio', y='hollows_ratio', hue='class', data=df)
# Standardise, then hold out 30% for testing.
x_scaled = x.apply(zscore)
xtrain, xtest, ytrain, ytest = train_test_split(x_scaled, y, test_size=0.30, random_state=1)
def getAccuracy(testSet, predictions):
    """Return the percentage (0-100) of predictions equal to the true labels.

    FIX: the original body had lost its indentation (SyntaxError), and relied
    on truthiness of one-element arrays when given a (n, 1)-shaped testSet.
    Flattening both inputs makes the element-wise comparison explicit and
    accepts lists, 1-D or column-vector arrays alike.
    """
    actual = np.asarray(testSet).ravel()
    predicted = np.asarray(predictions).ravel()
    correct = int(np.sum(actual == predicted))
    return (correct / float(len(actual))) * 100.0
# --- SVM on all 18 scaled features ---
clf = svm.SVC(gamma=0.025, C=3)
t1 = datetime.datetime.now()
# ravel(): scikit-learn expects a 1-D label vector, not an (n, 1) frame.
clf.fit(xtrain, ytrain.values.ravel())
y_pred = clf.predict(xtest)
t2 = datetime.datetime.now()
print('Time Taken:', t2 - t1)
# FIX: np.resize(ytest2, (254,)) returned a new array that was discarded
# (np.resize is not in-place) and hard-coded the test-set size; flatten
# explicitly instead.
ytest2 = ytest.to_numpy().ravel()
print('Accuracy :', getAccuracy(ytest2, y_pred))
# --- PCA: examine explained variance, then re-fit the SVM on 10 components ---
# Covariance matrix of the scaled features (computed for inspection; unused below).
covMatrix = np.cov(x_scaled, rowvar=False)
pca = PCA(n_components=18)
pca.fit(x_scaled)
# Variance explained by each individual component...
plt.bar(list(range(1, 19)), pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
# ...and cumulatively.
plt.step(list(range(1, 19)), np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()
# Keep the first 10 components and project the data onto them.
pca3 = PCA(n_components=10)
pca3.fit(x_scaled)
print(pca3.components_)
print(pca3.explained_variance_ratio_)
xpca = pca3.transform(x_scaled)
sns.pairplot(pd.DataFrame(xpca))
xtrain, xtest, ytrain, ytest = train_test_split(xpca, y, test_size=0.30, random_state=1)
t3 = datetime.datetime.now()
clf.fit(xtrain, ytrain.values.ravel())
y_pred = clf.predict(xtest)
t4 = datetime.datetime.now()
print('Time Taken :', t4 - t3)
# FIX: the original scored against the stale ytest2 from the pre-PCA split —
# correct only by coincidence (same random_state and length). Use this
# split's own test labels.
ytest2 = ytest.to_numpy().ravel()
print('Accuracy:', getAccuracy(ytest2, y_pred))
# --- Part 4: IPL batting statistics — load and explore ---
df = pd.read_csv('Part4 - batting_bowling_ipl_bat.csv')
df.head()
# Drop incomplete rows and renumber the index.
df = df.dropna()
df.reset_index(drop=True, inplace=True)
df.head()
print('The size of the data:', df.size)
print('No of rows in the data:', df.shape[0])
print('No of columns in the data:', df.shape[1])
df2 = df.copy(deep=True)
df2.drop('Name', axis=1, inplace=True)
# FIX: convert_objects() was removed from pandas; coerce non-numeric entries to NaN.
df2 = df2.apply(pd.to_numeric, errors='coerce')
df2.isnull().sum()
plt.figure(figsize=(15, 7))
plt.title('Correlation of Attributes', y=1.05, size=19)
sns.heatmap(df2.corr(), cmap='plasma', annot=True, fmt='.2f')
five_point = df2.describe().T
five_point[['min', '25%', '50%', '75%', 'max']]
sns.pairplot(df2, diag_kind='kde')
# FIX: keyword args replace the deprecated positional seaborn calls.
sns.jointplot(x='SR', y='Runs', data=df2)
sns.stripplot(x='HF', y='Runs', data=df, jitter=True)
sns.stripplot(x='HF', y='Sixes', data=df, jitter=True)
sns.stripplot(x='HF', y='Fours', data=df, jitter=True)
sns.scatterplot(x='Fours', y='Sixes', hue='HF', data=df)
sns.scatterplot(x='Runs', y='SR', hue='HF', data=df)
sns.scatterplot(x='Runs', y='Ave', hue='HF', data=df)
sns.boxplot(x=df['Ave'])
sns.boxplot(x=df['Runs'])
sns.boxplot(x=df['Sixes'])
sns.boxplot(x=df['Fours'])
# Standardise for distance-based clustering.
df_scaled = df2.apply(zscore)
df_scaled.nunique()
# Finding the optimal number of clusters via the elbow method.
# FIX: the loop body had lost its indentation (SyntaxError as plain Python).
clusters = range(1, 10)
meanDistortions = []
for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(df_scaled)
    prediction = model.predict(df_scaled)
    meanDistortions.append(
        sum(np.min(cdist(df_scaled, model.cluster_centers_, 'euclidean'), axis=1))
        / df_scaled.shape[0]
    )
plt.plot(clusters, meanDistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
# 5-cluster hierarchical clustering of the players.
# FIX: 'affinity' was renamed to 'metric' in scikit-learn 1.2 and removed in 1.4.
model = AgglomerativeClustering(n_clusters=5, metric='euclidean', linkage='average')
model.fit(df_scaled)
df['labels'] = model.labels_
df.head(10)
dfcluster = df.groupby(['labels'])
df3 = dfcluster.mean()
df3
# FIX: the original indexed df5 (the Part-1 car frame, whose labels only run
# 0-2) for the first three ranks; the ranks must come from the current IPL
# dataframe df, which carries the 5-cluster labels assigned just above.
Rank1 = df[df['labels'] == 3]
Rank2 = df[df['labels'] == 4]
Rank3 = df[df['labels'] == 2]
Rank4 = df[df['labels'] == 0]
Rank5 = df[df['labels'] == 1]
Z_average = linkage(df_scaled, metric='euclidean', method='average')
c, coph_dists = cophenet(Z_average, pdist(df_scaled))
print('cophenet index:', c)
# --- PCA-based image compression demo ---
# Load the test image and display it unchanged.
image_raw = imread("test.jpg")
print(image_raw.shape)
plt.figure(figsize=(12, 8))
plt.imshow(image_raw)
# Collapse the colour channels, then normalise to [0, 1] for a greyscale view.
image_sum = image_raw.sum(axis=2)
print(image_sum.shape)
peak = image_sum.max()
image_bw = image_sum / peak
print(image_bw.max())
plt.figure(figsize=(12, 8))
plt.imshow(image_bw, cmap=plt.cm.gray)
# Fit a full PCA over the greyscale rows and find how many components
# are needed to reach 95% cumulative explained variance.
pca = PCA()
pca.fit(image_bw)
var_cumu = np.cumsum(pca.explained_variance_ratio_) * 100
k = np.argmax(var_cumu > 95)
print("Number of components explaining 95% variance: " + str(k))
# Mark that threshold on the cumulative-variance curve.
plt.figure(figsize=(10, 5))
plt.title('Cumulative Explained Variance explained by the components')
plt.ylabel('Cumulative Explained variance')
plt.xlabel('Principal components')
plt.axvline(x=k, color="k", linestyle="--")
plt.axhline(y=95, color="r", linestyle="--")
ax = plt.plot(var_cumu)
# Reconstruct the image from only the first k components and display it.
ipca = IncrementalPCA(n_components=k)
image_recon = ipca.inverse_transform(ipca.fit_transform(image_bw))
plt.figure(figsize=(12, 8))
plt.imshow(image_recon, cmap=plt.cm.gray)